import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib inline  # IPython magic -- only valid inside a Jupyter/IPython
# session, so it is kept as a comment in plain-Python form.

# Load the vehicle silhouette dataset and take a first look at it.
data = pd.read_csv('vehicle.csv')
data.head()
data.describe().T
# numeric_only=True: the string 'class' column makes a bare corr() raise a
# TypeError on pandas >= 2.0 (older pandas silently dropped it).
data.corr(numeric_only=True)
sns.pairplot(data, diag_kind='kde')

# Count missing values per column and inspect the rows that contain any.
data.isna().sum()
null = data[data.isna().any(axis=1)]
null.head()
# Impute every numeric column's missing values with that column's mean.
# This replaces 14 copy-pasted per-column replace(..., inplace=True) calls,
# which relied on chained-assignment inplace mutation (deprecated in modern
# pandas and a FutureWarning there), and it generalizes to any column that
# happens to contain NaNs.
for _col in data.select_dtypes(include='number').columns:
    if data[_col].isna().any():
        data[_col] = data[_col].fillna(data[_col].mean())
data.info()
data.boxplot(figsize=(40,20))
# The boxplots show outliers in the variables below. They were originally
# listed as bare expressions, which is broken in plain Python
# (`scaled_variance.1` is a SyntaxError, the rest are NameErrors), so the
# list is kept as a comment instead:
#   radius_ratio
#   pr.axis_aspect_ratio
#   max.length_aspect_ratio
#   scaled_variance
#   scaled_variance.1
#   scaled_radius_of_gyration.1
#   skewness_about
#   skewness_about.1
data['skewness_about'].max()
# Cross-tabulate each outlier-prone feature against the target class to see
# how the extreme values are distributed across classes.
for outlier_col in ('max.length_aspect_ratio',
                    'radius_ratio',
                    'pr.axis_aspect_ratio',
                    'scaled_variance',
                    'scaled_variance.1'):
    pd.crosstab(data[outlier_col], data['class'])
# pd.crosstab(data['scaled_radius_of_gyration.1'], data['class'])  # 87
# pd.crosstab(data['skewness_about'], data['class'])  # 18
# pd.crosstab(data['skewness_about.1'], data['class'])
# Remove outliers with Tukey's rule: drop every row that falls outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] on any column.
q1 = data.quantile(0.25)
q3 = data.quantile(0.75)
iqr = q3 - q1
iqr
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
is_outlier_row = ((data < lower_fence) | (data > upper_fence)).any(axis=1)
data = data[~is_outlier_row]
data.boxplot(figsize=(40,20))
# Outliers have been removed.
# Encode the string target 'class' into integer labels, then revisit the
# pairwise relationships and the correlation heatmap with the encoded target.
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
data['class'] = encoder.fit_transform(data['class'])
data.head()
sns.pairplot(data, diag_kind='kde', hue='class')
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(data.corr(), annot=True)
# Find feature pairs with near-zero correlation (|r| < 0.1) and, within
# those, the features that barely correlate with the encoded target.
Correlation = data.corr()
# The target's column position was hard-coded as 18 in the original; look
# it up by name so the check survives column additions/removals.
class_idx = Correlation.columns.get_loc('class')
weak = np.where((Correlation < 0.1) & (Correlation > -0.1))
weak_pairs = [(Correlation.index[x], Correlation.columns[y])
              for x, y in zip(*weak) if x != y and x < y]
if len(weak_pairs) == 0:
    print("some correlation between them.")  # typo 'corrrelation' fixed
else:
    # Reuse the indices already computed above (the original recomputed
    # np.where on the exact same condition here).
    weak_with_class = [Correlation.index[x]
                       for x, y in zip(*weak)
                       if x != y and x < y and y == class_idx]
    if len(weak_with_class) == 0:
        print('correlation with class column.')
    else:
        print("No good correlation between class and other variables: ",
              weak_with_class)
# Scaler: standardize every column to zero mean / unit variance.
# (A duplicate `from scipy.stats import zscore` line was removed.)
from scipy.stats import zscore

# NOTE(review): this z-scores the encoded 'class' column too, since it is
# part of the DataFrame at this point -- confirm that is intended.
data_new = data.apply(zscore)
# PCA -- dimensionality reduction section. (The bare name `PCA` that stood
# here was a NameError: sklearn's PCA is not imported until later.)
from sklearn import model_selection

# Drop the target plus the features flagged earlier as weakly correlated
# or redundant, keeping the rest as the model inputs.
columns = ['class', 'compactness', 'distance_circularity',
           'pr.axis_aspect_ratio', 'max.length_rectangularity',
           'skewness_about.1', 'skewness_about.2']
X = data_new.drop(columns, axis=1)
# NOTE(review): y is taken from the z-scored frame, so the labels are
# standardized floats rather than the raw encoded classes -- confirm.
y = data_new['class']

test_size = 0.30  # taking 70:30 training and test set
seed = 10  # fixed random seed for repeatability of the split
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=test_size, random_state=seed)
from sklearn.decomposition import PCA

# Fit a full (all-components) PCA to inspect how much variance each
# principal component explains.
pca = PCA()
pca.fit(X_train)
Variance = pca.explained_variance_ratio_
singular_values = pca.singular_values_
print(Variance)
singular_values

# Express each component's share as a percentage of the total explained
# variance (sorted descending) and accumulate the running total.
total = sum(Variance)
var_exp = [100 * ratio / total for ratio in sorted(Variance, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
print("Cumulative Variance Explained", cum_var_exp)
# Scree chart: per-component explained variance (bars) with the cumulative
# total overlaid (step line).
plt.figure(figsize=(10, 5))
plt.bar(range(1, Variance.size + 1), var_exp, alpha=0.5, align='center',
        label='Individual explained variance')
plt.step(range(1, Variance.size + 1), cum_var_exp, where='mid',
         label='Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

# Elbow plot of the same per-component explained variance.
fig = plt.figure(figsize=(10, 10))
plt.plot(range(1, Variance.size + 1), var_exp, 'bo-', linewidth=2)
plt.title('Elbow Plot')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')  # fixed label typo: 'Varience'
plt.show()
# Keep enough components to explain 98% of the variance and project both
# the training and test sets into that reduced space.
pca = PCA(n_components=.98)
pca.fit(X_train)
X_train_new = pca.transform(X_train)
X_test_new = pca.transform(X_test)

# Build a DataFrame of the projected training data together with the target.
# BUG FIX: pd.DataFrame(X_train_new) gets a fresh 0..n-1 RangeIndex while
# y_train keeps the shuffled index from train_test_split, so the original
# `.join(y_train)` aligned on mismatched indices and filled 'class' with
# NaN. Assign the values positionally instead.
Proj_data_Train = pd.DataFrame(X_train_new)
Proj_data_Train['class'] = y_train.to_numpy()
plt.subplots(figsize=(10, 10))
sns.heatmap(Proj_data_Train.corr(), annot=True, linewidths=1)
# SVM -- baseline support-vector classifier on the PCA-projected features.
# (The bare name `SVM` that stood here was a NameError in plain Python.)
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train_new, y_train)
svc.score(X_test_new, y_test)
# Grid-search the SVC's C and kernel with 10-fold cross-validation,
# parallelized across every available CPU core.
import multiprocessing
from sklearn.model_selection import GridSearchCV

search_space = [{'kernel': ['linear', 'rbf'], 'C': [0.01, 0.05, 0.5, 1]}]
g_s = GridSearchCV(estimator=SVC(),
                   param_grid=search_space,
                   scoring='accuracy',
                   cv=10,
                   n_jobs=multiprocessing.cpu_count())
g_s.fit(X_train_new, y_train)
g_s.best_estimator_
g_s.best_score_
# Grid search selected C=1 and kernel='rbf' as the best fit; refit with
# those hyperparameters and evaluate.
svc = SVC(C=1, kernel='rbf')
svc.fit(X_train_new, y_train)
# BUG FIX: the original scored undefined names X_train_pca / X_test_pca
# (NameError) -- the PCA-projected sets are X_train_new / X_test_new.
print("Accuracy on training set:", format(svc.score(X_train_new, y_train)))
print("Accuracy on test set:", format(svc.score(X_test_new, y_test)))

# 10-fold cross-validated accuracy on the train and test partitions.
from sklearn.model_selection import cross_val_score
scoresTrain = cross_val_score(svc, X_train_new, y_train, cv=10)
print("Train Accuracy:", (scoresTrain.mean()))
scoresTest = cross_val_score(svc, X_test_new, y_test, cv=10)
print("Test Accuracy:", (scoresTest.mean()))